Topic: COVID-19 New cases visualisation dashboard: Shiny app

covid = read.csv("owid-covid-data.csv")

# these are indices for each country (an index is just a collection of stocks)
# 'FileEncoding' just cleans the column encoding for this case

#sp500 = read.csv("SPY Historical Data.csv", fileEncoding = 'UTF-8-BOM') # This is for US
#TOPIX = read.csv("TOPIX Historical Data.csv", fileEncoding = 'UTF-8-BOM') # This is for Japan
ASX200 = read.csv("S&P_ASX 200 Historical Data.csv", fileEncoding = 'UTF-8-BOM') # This is for Australia
#NSEI = read.csv("Nifty 50 Historical Data.csv", fileEncoding = 'UTF-8-BOM') # This is for India
#SSEC = read.csv("Shanghai Composite Historical Data.csv", fileEncoding = 'UTF-8-BOM') # this is for China

colnames(covid)
##  [1] "iso_code"                                  
##  [2] "continent"                                 
##  [3] "location"                                  
##  [4] "date"                                      
##  [5] "total_cases"                               
##  [6] "new_cases"                                 
##  [7] "new_cases_smoothed"                        
##  [8] "total_deaths"                              
##  [9] "new_deaths"                                
## [10] "new_deaths_smoothed"                       
## [11] "total_cases_per_million"                   
## [12] "new_cases_per_million"                     
## [13] "new_cases_smoothed_per_million"            
## [14] "total_deaths_per_million"                  
## [15] "new_deaths_per_million"                    
## [16] "new_deaths_smoothed_per_million"           
## [17] "reproduction_rate"                         
## [18] "icu_patients"                              
## [19] "icu_patients_per_million"                  
## [20] "hosp_patients"                             
## [21] "hosp_patients_per_million"                 
## [22] "weekly_icu_admissions"                     
## [23] "weekly_icu_admissions_per_million"         
## [24] "weekly_hosp_admissions"                    
## [25] "weekly_hosp_admissions_per_million"        
## [26] "total_tests"                               
## [27] "new_tests"                                 
## [28] "total_tests_per_thousand"                  
## [29] "new_tests_per_thousand"                    
## [30] "new_tests_smoothed"                        
## [31] "new_tests_smoothed_per_thousand"           
## [32] "positive_rate"                             
## [33] "tests_per_case"                            
## [34] "tests_units"                               
## [35] "total_vaccinations"                        
## [36] "people_vaccinated"                         
## [37] "people_fully_vaccinated"                   
## [38] "total_boosters"                            
## [39] "new_vaccinations"                          
## [40] "new_vaccinations_smoothed"                 
## [41] "total_vaccinations_per_hundred"            
## [42] "people_vaccinated_per_hundred"             
## [43] "people_fully_vaccinated_per_hundred"       
## [44] "total_boosters_per_hundred"                
## [45] "new_vaccinations_smoothed_per_million"     
## [46] "new_people_vaccinated_smoothed"            
## [47] "new_people_vaccinated_smoothed_per_hundred"
## [48] "stringency_index"                          
## [49] "population"                                
## [50] "population_density"                        
## [51] "median_age"                                
## [52] "aged_65_older"                             
## [53] "aged_70_older"                             
## [54] "gdp_per_capita"                            
## [55] "extreme_poverty"                           
## [56] "cardiovasc_death_rate"                     
## [57] "diabetes_prevalence"                       
## [58] "female_smokers"                            
## [59] "male_smokers"                              
## [60] "handwashing_facilities"                    
## [61] "hospital_beds_per_thousand"                
## [62] "life_expectancy"                           
## [63] "human_development_index"                   
## [64] "excess_mortality_cumulative_absolute"      
## [65] "excess_mortality_cumulative"               
## [66] "excess_mortality"                          
## [67] "excess_mortality_cumulative_per_million"

data clean

#Dropping NA
covid_clean = covid %>% drop_na(new_cases, new_cases_smoothed, new_vaccinations, new_vaccinations_smoothed, new_vaccinations_smoothed_per_million, population, population_density, median_age, extreme_poverty, total_vaccinations, hospital_beds_per_thousand, human_development_index, new_deaths, new_tests, weekly_icu_admissions_per_million, weekly_icu_admissions)

#This code changes the negative case values in the data set to zero
covid_clean$new_cases[covid_clean$new_cases < 0] <- 0
#Dropping NA
covid_AUS <- covid %>% filter(location == "Australia")
covid_clean_AUS = covid_AUS %>% drop_na(new_cases, new_cases_smoothed, new_vaccinations, new_vaccinations_smoothed, new_vaccinations_smoothed_per_million, population, population_density, median_age, extreme_poverty, total_vaccinations, hospital_beds_per_thousand, human_development_index, new_deaths, new_tests, total_tests, total_cases)

#This code changes the negative case values in the data set to zero
covid_clean_AUS$new_cases[covid_clean_AUS$new_cases < 0] <- 0
view(covid_clean_AUS)

#glimpse(covid_clean)
#covid_clean$date = as.Date(covid_clean$date)
#max(covid_clean$date)
#unique(covid_clean$location)

Question 1

performing join

#code for more optimized join (to be pasted later)
#class(covid_clean$date)

# using a copy just in case
covid2 <- covid_clean_AUS

#sp500$Date = mdy(sp500$Date)
#TOPIX$Date = mdy(TOPIX$Date)
ASX200$date = mdy(ASX200$Date)
#NSEI$Date = mdy(NSEI$Date)
#SSEC$Date = mdy(SSEC$Date)

#sp500$date = mdy(sp500$Date)
#TOPIX$date = mdy(TOPIX$Date)
ASX200$date = mdy(ASX200$Date)
#NSEI$date = mdy(NSEI$Date)
#SSEC$date = mdy(SSEC$Date)

covid2$date = ymd(covid2$date)

#Temporarily changing the date to character as joining cannot be done with date objects
#Also selecting relevant columns for analysis later
covid2 <- covid2 %>%
  transform(covid2, date = as.character(date)) %>% 
  select(date, new_cases, new_deaths, location, new_vaccinations, new_tests, population, population_density, total_tests, total_vaccinations, total_cases)

#sp500$Date <- as.character(sp500$Date)
#TOPIX$Date <- as.character(TOPIX$Date)
ASX200$date <- as.character(ASX200$date)
#NSEI$Date <- as.character(NSEI$Date)
#SSEC$Date <- as.character(SSEC$Date)

#sp500$date <- as.character(sp500$Date)
#TOPIX$date <- as.character(TOPIX$Date)
ASX200$date <- as.character(ASX200$date)
#NSEI$date <- as.character(NSEI$Date)
#SSEC$date <- as.character(SSEC$Date)

# renaming column so it has same name as the stock market data frames for joining later
#colnames(covid2)[1] = "Date"

# making data frames for each country we select to perform individual joins on each to their respective stock market index
#covid_US <- covid2 %>% filter(location == "United States")
covid_AUS <- covid2 %>% filter(location == "Australia")
#covid_IND <- covid2 %>% filter(location == "India")
#covid_JPN <- covid2 %>% filter(location == "Japan")
#covid_CHN <- covid2 %>% filter(location == "China")

# performing joins 

#df_1 = inner_join(sp500, covid_US, by = "Date")
#df_2 = inner_join(TOPIX, covid_JPN, by = "Date")
df_3 = inner_join(ASX200, covid_AUS, by = "date")
#df_4 = inner_join(NSEI, covid_IND, by = "Date")
#df_5 = inner_join(SSEC, covid_CHN, by = "Date")

#df_1 = inner_join(sp500, covid_US, by = "date")
#df_2 = inner_join(TOPIX, covid_JPN, by = "date")
df_3 = inner_join(ASX200, covid_AUS, by = "date")
#df_4 = inner_join(NSEI, covid_IND, by = "date")
#df_5 = inner_join(SSEC, covid_CHN, by = "date")


# vertically joined data set (now one column will store all the values of the respective country index)
# e.g US stores prices relevant to S&P500 and China's prices are relevant to the the SSEC which is based in Shanghai.
#covid_joined <- rbind(df_1, df_2, df_3, df_4, df_5)

# Still need transform relevant column to numeric ect...will do a little later 

df_aus <- df_3

#transformation
df_aus$date = as.Date(df_aus$date)
df_aus$Price = as.numeric(gsub(",","",df_aus$Price))

view(df_aus)

Initial analysis for new_cases_smoothed

covid_temp <- covid_clean
covid_temp$month <- strftime(covid_temp$date, "%m")
covid_temp$year  <- strftime(covid_temp$date, "%Y")

covid_temp_new_case_aggregate <- aggregate(new_cases_smoothed~month+year, 
                  covid_temp,
                  FUN = mean)
covid_temp_new_case_aggregate$month_year <- paste(covid_temp_new_case_aggregate$month, covid_temp_new_case_aggregate$year)
ggplot(covid_temp_new_case_aggregate, aes(x= new_cases_smoothed, y= month_year)) +
  geom_bar(stat = 'identity') + ggtitle("Average New Cases (Smoothed) per Month") +
  ylab("month year") + xlab("average new cases")
  • Analysis: From the above bar plot, we set different months of 2021 and 2022 as y-axis and the average smoothed new cases for each month as x-axis. We can easily know that smoothed new cases in most months are less than 30,000 except in Jan, 2022, which is nearly 50,000. So the data of new_cases_smoothed is evenly distributed and has a small standard deviation.

Initial analysis for new_people_vaccinated_smoothed

covid_temp = covid_clean %>% select(date,new_people_vaccinated_smoothed)
covid_temp$date <- as.Date(covid_temp$date, format = "%Y-%m-%d")
covid_temp_new_vaccinated_aggregate = covid_temp %>% mutate(month_year = as.character(format(date, "%m-%Y"))) %>%
          group_by(month_year) %>%
  summarise(date=date[1], number = mean(new_people_vaccinated_smoothed))
covid_temp_new_vaccinated_aggregate  
ggplot(covid_temp_new_vaccinated_aggregate, aes(x = number, y= month_year)) +
  geom_bar(stat = 'identity') + ggtitle("Average New People Vaccinated (Smoothed) per Month") +
  ylab("month year") + xlab("average new people vaccinated")
  • Analysis: Most number show above are larger than 100000. Many people get vaccinated in July, August and September in 2021, and less people get vaccinated before and after that period which is reasonable since it takes time to invent and promote new vaccines.

Q2 a

df_aus = df_aus[order(as.Date(df_aus$date, format="%d/%m/%Y")),]

colnames(df_aus)[7] = 'Change'

#write.csv(df_aus,"df_aus.csv")

Price ~ new_cases

#glimpse(df_aus)
df_aus_subset = df_aus %>% select(Price, new_cases)
view(df_aus_subset)

M0 = lm(Price ~ new_cases, data = df_aus_subset) # Null model
summary(M0)
## 
## Call:
## lm(formula = Price ~ new_cases, data = df_aus_subset)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -549.62 -157.03   60.97  163.28  405.41 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.223e+03  1.525e+01 473.504   <2e-16 ***
## new_cases   1.166e-03  5.762e-04   2.024    0.044 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 226.3 on 262 degrees of freedom
## Multiple R-squared:  0.0154, Adjusted R-squared:  0.01164 
## F-statistic: 4.097 on 1 and 262 DF,  p-value: 0.04396
  • Analysis(Paul and Christin): We build a linear model here to explore relationship between stock prices and new cases of covid-19. The dependent variable here is stock prices while independent variable is new cases. Although the r-squared value is quite low and Residual standard error is a bit high, which indicates our model is not good, the p-value for new cases is 0.044( < 0.05), so it is a significantly influence Price. The final model we get is \(\text{Price} = 1.166e^{-03}\times \text{new_cases} + 7.223e^{03}\).

Scatter Plot for price and new_cases

y <- df_aus$Price
x <- df_aus$new_cases

plot(x, y, main = "Price ~ New Cases",
     ylab = "Price", xlab = "new_cases",
     pch = 19, frame = FALSE)
# Add regression line
plot(x, y, main = "Price ~ New Cases",
     ylab = "Price", xlab = "new_cases",
     pch = 19, frame = FALSE)
abline(lm(y ~ x, data = df_aus), col = "blue")

Price ~ new_vaccinations

# df_aus_subset = df_aus %>% select(Price, new_deaths) %>% filter(new_deaths != 0)

New deaths and prices

df_aus_subset = df_aus %>% select(Price, new_deaths, new_vaccinations, new_cases)
view(df_aus_subset)

M1 = lm(Price ~ poly(new_vaccinations, degree=2), data=df_aus_subset)
summary(M1)
## 
## Call:
## lm(formula = Price ~ poly(new_vaccinations, degree = 2), data = df_aus_subset)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -555.39  -72.58   -0.10   77.58  697.87 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                          7235.494      8.446  856.70   <2e-16 ***
## poly(new_vaccinations, degree = 2)1  2467.685    137.228   17.98   <2e-16 ***
## poly(new_vaccinations, degree = 2)2 -1620.816    137.228  -11.81   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 137.2 on 261 degrees of freedom
## Multiple R-squared:  0.6394, Adjusted R-squared:  0.6367 
## F-statistic: 231.4 on 2 and 261 DF,  p-value: < 2.2e-16
ggplot(data = df_aus_subset, mapping = aes(x = Price, y = new_deaths)) + geom_point() + ggtitle("Prices of stocks and against new death") + ylab("New deaths") + xlab("Price")

Price ~ new_vaccinations

ggplot(data = df_aus, mapping = aes(x = new_vaccinations, y = Price)) + geom_point() + ggtitle("Prices of stocks against new vaccinations") + xlab("New vaccinations") + ylab("Price")

Price ~ New tests

df_aus_subset = df_aus %>% select(Price, new_tests)
view(df_aus_subset)
M2 = lm(Price ~ poly(new_tests, degree=2), data=df_aus_subset)
summary(M2)
## 
## Call:
## lm(formula = Price ~ poly(new_tests, degree = 2), data = df_aus_subset)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -469.40  -88.38   12.24   86.71  401.48 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   7235.494      8.392 862.236  < 2e-16 ***
## poly(new_tests, degree = 2)1  2767.837    136.347  20.300  < 2e-16 ***
## poly(new_tests, degree = 2)2 -1057.592    136.347  -7.757 1.97e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 136.3 on 261 degrees of freedom
## Multiple R-squared:  0.6441, Adjusted R-squared:  0.6413 
## F-statistic: 236.1 on 2 and 261 DF,  p-value: < 2.2e-16
ggplot(data = df_aus, mapping = aes(x = new_tests, y = Price)) + geom_point() + ggtitle("Prices of stocks against new tests") + xlab("New tests") + ylab("Price")

ggplot(data = df_aus, mapping = aes(x = Price, y = population)) + geom_point() + ggtitle("Prices of stocks against population") + ylab("New tests") + xlab("Price")
ggplot(data = df_aus, mapping = aes(x = Price, y = population_density)) + geom_point() + ggtitle("Prices of stocks against population density") + ylab("New tests") + xlab("Price")
ggplot(data = df_aus, mapping = aes(x = date, y = population_density)) + geom_line() + ggtitle("Prices of stocks against population density") + ylab("New tests") + xlab("Price") # bad
summary(df_aus$Price)
df_aus$Price_mul30 = df_aus$Price*30
p1 = ggplot(data = df_aus) + geom_line(aes(x=date, y = new_cases), color = "red") + geom_line(aes(x=date, y = new_vaccinations), color = "light green") + geom_line(aes(x=date, y = new_tests), color = "light blue") +
  geom_line(aes(x=date, y = Price_mul30), color = "blue") + theme_bw()

df_aus$Price_mul30 = df_aus$Price*30
p1 = ggplot(data = df_aus) + geom_line(aes(x=date, y = new_cases), color = "red") + geom_line(aes(x=date, y = new_vaccinations), color = "light green") + geom_line(aes(x=date, y = new_tests), color = "light blue")+ theme_bw()
ggplotly(p1)
autoplot(M0, which = 1:2)
autoplot(M1, which = 1:2)
autoplot(M2, which = 1:2)
M = lm(Price ~ polym(new_tests, new_vaccinations, degree=2, raw=TRUE), data=df_aus)
summary(M)
## 
## Call:
## lm(formula = Price ~ polym(new_tests, new_vaccinations, degree = 2, 
##     raw = TRUE), data = df_aus)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -484.56  -59.44    7.11   68.74  320.66 
## 
## Coefficients:
##                                                                 Estimate
## (Intercept)                                                    6.652e+03
## polym(new_tests, new_vaccinations, degree = 2, raw = TRUE)1.0  2.773e-03
## polym(new_tests, new_vaccinations, degree = 2, raw = TRUE)2.0 -2.754e-09
## polym(new_tests, new_vaccinations, degree = 2, raw = TRUE)0.1  3.958e-03
## polym(new_tests, new_vaccinations, degree = 2, raw = TRUE)1.1 -2.408e-09
## polym(new_tests, new_vaccinations, degree = 2, raw = TRUE)0.2 -8.281e-09
##                                                               Std. Error
## (Intercept)                                                    2.498e+01
## polym(new_tests, new_vaccinations, degree = 2, raw = TRUE)1.0  4.246e-04
## polym(new_tests, new_vaccinations, degree = 2, raw = TRUE)2.0  1.455e-09
## polym(new_tests, new_vaccinations, degree = 2, raw = TRUE)0.1  3.417e-04
## polym(new_tests, new_vaccinations, degree = 2, raw = TRUE)1.1  1.588e-09
## polym(new_tests, new_vaccinations, degree = 2, raw = TRUE)0.2  9.333e-10
##                                                               t value Pr(>|t|)
## (Intercept)                                                   266.296  < 2e-16
## polym(new_tests, new_vaccinations, degree = 2, raw = TRUE)1.0   6.531 3.45e-10
## polym(new_tests, new_vaccinations, degree = 2, raw = TRUE)2.0  -1.892   0.0596
## polym(new_tests, new_vaccinations, degree = 2, raw = TRUE)0.1  11.585  < 2e-16
## polym(new_tests, new_vaccinations, degree = 2, raw = TRUE)1.1  -1.516   0.1307
## polym(new_tests, new_vaccinations, degree = 2, raw = TRUE)0.2  -8.873  < 2e-16
##                                                                  
## (Intercept)                                                   ***
## polym(new_tests, new_vaccinations, degree = 2, raw = TRUE)1.0 ***
## polym(new_tests, new_vaccinations, degree = 2, raw = TRUE)2.0 .  
## polym(new_tests, new_vaccinations, degree = 2, raw = TRUE)0.1 ***
## polym(new_tests, new_vaccinations, degree = 2, raw = TRUE)1.1    
## polym(new_tests, new_vaccinations, degree = 2, raw = TRUE)0.2 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 110.5 on 258 degrees of freedom
## Multiple R-squared:  0.7688, Adjusted R-squared:  0.7643 
## F-statistic: 171.5 on 5 and 258 DF,  p-value: < 2.2e-16
ggplot(data = df_aus) + geom_point(aes(x = new_tests, y = Price), color = "light blue") +
  geom_point(aes(x = new_vaccinations, y = Price), color = "light green") +
  ggtitle("Prices of stocks against new tests and new_vaccinations") + ylab("Price") +theme_classic()

  • The behaviour of a dependent variable is explained by a linear, or curvilinear, additive relationship between the dependent variable and a set of k independent variables (xi, i=1 to k).
  • The relationship between the dependent variable and any independent variable is linear or curvilinear.
  • The independent variables do no depend on each other too.
  • The errors are independent, normally distributed with mean zero and a constant variance.

Log Price vs grwth rate

Calculate grwth rate

covid_clean_AUS <- df_aus
percentage_tests <- (covid_clean_AUS$new_tests/ covid_clean_AUS$total_tests)*100
vec_percentage_tests <- c(percentage_tests)
covid_clean_AUS <-cbind(covid_clean_AUS, perc_new_tests = vec_percentage_tests)

covid_clean_AUS <- covid_clean_AUS
percentage_cases <- (covid_clean_AUS$new_cases/ covid_clean_AUS$total_cases)*100
vec_percentage_cases <- c(percentage_cases)
covid_clean_AUS <-cbind(covid_clean_AUS, perc_new_cases = vec_percentage_cases)

covid_clean_AUS <- covid_clean_AUS
percentage_vacc <- (covid_clean_AUS$new_vaccinations/ covid_clean_AUS$total_vaccinations)*100
vec_percentage_vacc <- c(percentage_vacc)
covid_clean_AUS <-cbind(covid_clean_AUS, perc_new_vacc = vec_percentage_vacc)
df_aus_Q2 <- covid_clean_AUS

df_aus_Q2$lprice = log(df_aus_Q2$Price)
view(df_aus_Q2)

Visulization

ggplot(data = df_aus_Q2) + geom_point(aes(x = perc_new_tests, y = lprice), color = "light blue") +
  geom_point(aes(x = perc_new_vacc, y = lprice), color = "light green") +
  geom_point(aes(x = perc_new_cases, y = lprice), color = "red") +
  ggtitle("Prices of stocks againsts growth rate of new cases, new vaccinations and new tests") + xlab("growth rate") + ylab("log(Price)") +theme_classic()

Very bad plot….